require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Hmisc::getHdata(nhgh)
d1 <- nhgh %>% 
  filter(sex == "female") %>% 
  filter(age >= 18) %>% 
  select(gh, ht) %>% 
  filter(1:n()<=1000)

1 Introduction

We want to figure out which kind of distribution fits the data well. In this blog, we choose 3 kinds of distribution, normal, gamma and weibull distribution. These three distribution are the most common distribution.

We use MM and MLE to calculate the parameters for each distribution and then make some plots to find out which distribution fits the data.

1.1 What is Method of Moments (MM)

Method of Moments:the method of moments is a method of estimation of population parameters.(From wikipedia)

  • the step of MM:(From professor’s slide)

1a. Choose a parametric distribution, \(F_X(x|\theta)\)

1b. Identify the distribution parameters, \(\theta\)

Calculate/find distribution moments (need as many moments as distribution parameters), \(E[X^k]\) or \(E[(X - E[X])^k]\)

Calculate sample moments.

Create system of equations equating distribution moments to sample moments.

Solve system of equations for distribution parameters.

\(\hat{F}_X = F_X(x | \theta = \hat{\theta})\)

1.2 What is Maximum Likelihood Estimation (MLE)

In statistics, maximum likelihood estimation (MLE) is a method of estimating the parameters of a probability distribution by maximizing a likelihood function, so that under the assumed statistical model the observed data is most probable. The point in the parameter space that maximizes the likelihood function is called the maximum likelihood estimate.(From wikipedia)

2 ht

2.1 MM

2.1.1 Estimates of parameters

mm.gamma.shape = mean(d1$ht)^2/var(d1$ht)
mm.gamma.scale=var(d1$ht)/mean(d1$h)

mm.norm.mean=mean(d1$ht)
mm.norm.sd=sd(d1$ht)

mean.weib = function(lambda, k){
  lambda*gamma(1 + 1/k)
}

lambda = function(samp.mean, k){
  samp.mean/gamma(1+1/k)
}

var.weib = function(samp.mean, k,samp.var){
  (lambda(samp.mean, k))^2*(gamma(1+2/k)-(gamma(1+1/k))^2)-samp.var
}

2.1.2 Overlay estimated pdf onto histogram

mm.opt = optimize(f = function(x){abs(var.weib(k = x, samp.mean = mean(d1$ht), samp.var = var(d1$ht)))},lower = 10, upper = 100)
mm.weib.k = mm.opt$minimum
mm.weib.lambda= lambda(samp.mean = mean(d1$ht), k=mm.weib.k)

hist(d1$ht, main = "Height of Adult Females: MM", breaks = 100, freq = FALSE)
curve(dgamma(x, shape = mm.gamma.shape, scale = mm.gamma.scale), add = TRUE, col = "red")
curve(dnorm(x, mm.norm.mean, mm.norm.sd), add = TRUE, col = "blue", lwd = 2)
curve(dweibull(x, shape = mm.weib.k, scale = mm.weib.lambda),add = TRUE, col = "green",lwd = 2)

  • the red line is gamma, the blue line is norm and the green line is weibull

2.1.3 Overlay estimated CDF onto eCDF

plot(ecdf(d1$ht),main = "The CDF and ECDF of ht: MM")
curve(pgamma(x, shape = mm.gamma.shape, scale = mm.gamma.scale), add = TRUE, col = "red",lwd = 6)
curve(pnorm(x, mm.norm.mean, mm.norm.sd), add = TRUE, col = "blue", lwd = 4)
curve(pweibull(x, shape = mm.weib.k, scale = mm.weib.lambda),add = TRUE, col = "green",lwd = 4)

  • the red line is gamma, the blue line is norm and the green line is weibull

2.1.4 QQ plot (sample vs estimated dist)

p = ppoints(300)
y = quantile(d1$ht, probs = p)
#gamma
x.gamma.mm = qgamma(p, shape = mm.gamma.shape, scale = mm.gamma.scale)
plot(x.gamma.mm,y,main = "Gamma QQplot")
abline(0,1)

#norm
x.norm.mm = qnorm(p, mm.norm.mean, mm.norm.sd)
plot(x.norm.mm,y,main = "Norm QQplot")
abline(0,1)

#weibull
x.pwei.mm=qweibull(p, shape = mm.weib.k, scale = mm.weib.lambda)
plot(x.pwei.mm,y,main = "Weibull QQplot")
abline(0,1)

2.1.5 Estimated Median

median(d1$ht)
## [1] 160.8
qweibull(0.5, shape = mm.weib.k, scale = mm.weib.lambda)
## [1] 161.8065
qgamma(0.5, shape = mm.gamma.shape, scale = mm.gamma.scale)
## [1] 160.6308
qnorm(0.5, mm.norm.mean, mm.norm.sd)
## [1] 160.7419
  • we could know that the median of ht is 160.8 , and the norm median is 160.7419, the gamma median is 160.630794, and the qweibull median is 161.8065244

2.1.6 Median Samp Dist (hist)

#gamma
med.gam = NA
for (i in 1:1000) {
  data = rgamma(200, shape = mm.gamma.shape, scale = mm.gamma.scale)
  med.gam[i] = median(data)
}
hist(med.gam,breaks = 100,main = "Histogram of median of gamma distribution")

#norm
med.norm = NA
for (i in 1:1000) {
  data = rnorm(200, mm.norm.mean, mm.norm.sd)
  med.norm[i] = median(data)
}
hist(med.norm,breaks = 100,main = "Histogram of median of normal distribution")

#
med.wei = NA
for (i in 1:1000) {
  data = rweibull(200, shape = mm.weib.k, scale = mm.weib.lambda)
  med.wei[i] = median(data)
}
hist(med.wei,breaks = 100,main = "Histogram of median of weibull distribution")

2.1.7 Range of middle 95% of Samp Dist

diff(quantile(med.gam,probs = c(0.025,0.975)))
##    97.5% 
## 2.415832
diff(quantile(med.norm,probs = c(0.025,0.975)))
##    97.5% 
## 2.602738
diff(quantile(med.wei,probs = c(0.025,0.975)))
##    97.5% 
## 2.396322
  • Range of middle 95% of Samp Dist of gamma is 2.4158317, norm is 2.6027384, and the weibull is 2.3963224

2.2 MLE

2.2.1 Estimates of parameters

2.2.1.1 Normal distribution

library(stats4)
nLL <- function(mean, sd){
  fs <- dnorm(
    x = d1$ht
    ,mean = mean
    ,sd = sd
    ,log = TRUE
  )
  -sum(fs)
}

fit <- mle(
  nLL
  ,start = list(mean = 160, sd = 5)
  ,method = "L-BFGS-B"
  ,lower = c(0,0.01)
)

2.2.1.2 Gamma distribution

nLL.gamma <- function(shape, scale){
  fs <- dgamma(
    x = d1$ht
    ,shape = shape
    ,scale = scale
    ,log = TRUE
  )
  -sum(fs)
}

fit.gamma <- mle(
  nLL.gamma
  ,start = list(shape = 1, scale = 1)
  ,method = "L-BFGS-B"
  ,lower = c(0,0.01)
)

2.2.1.3 Weibull distribution

nLL.weibull <- function(shape, scale){
  fs <- dweibull(
    x = d1$ht
    ,shape = shape
    ,scale = scale
    ,log = TRUE
  )
  -sum(fs)
}

fit.weibull <- mle(
  nLL.weibull
  ,start = list(shape = 1, scale = 1)
  ,method = "L-BFGS-B"
  ,lower = c(0,0.01)
)

2.2.2 Overlay estimated pdf onto histogram

2.2.2.1 Normal distribution

hist(d1$ht,breaks = 100, freq = FALSE,,main = "Overlay estimated pdf onto histogram of Normal distribution")
curve(dnorm(x,mean = coef(fit)[1],sd =coef(fit)[2]),col = "red" ,add = TRUE, lwd = 4)

2.2.2.2 Gammal distribution

hist(d1$ht,breaks = 100, freq = FALSE,,main = "Overlay estimated pdf onto histogram of Gamma distribution")
curve(dgamma(x,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2]),col = "blue" ,add = TRUE, lwd = 4)

2.2.2.3 Weibull distribution

hist(d1$ht,breaks = 100, freq = FALSE,main = "Overlay estimated pdf onto histogram of Weibull distribution")
curve(dweibull(x,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2]),col = "green" ,add = TRUE, lwd = 4)

2.2.3 Overlay estimated CDF onto eCDF

2.2.3.1 Normal distribution

plot(ecdf(d1$ht),,main = "Overlay estimated CDF onto eCDF of normal distribution")
curve(pnorm(x,mean = coef(fit)[1],sd =coef(fit)[2]),col = "red" ,add = TRUE, lwd = 4)

2.2.3.2 Gamma distribution

plot(ecdf(d1$ht),,main = "Overlay estimated CDF onto eCDF of gamma distribution")
curve(pgamma(x,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2]),col = "blue" ,add = TRUE, lwd = 4)

2.2.3.3 Weibull distribution

plot(ecdf(d1$ht),,main = "Overlay estimated CDF onto eCDF of weibull distribution")
curve(pweibull(x,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2]),col = "green" ,add = TRUE, lwd = 4)

2.2.4 QQ plot (sample vs estimated dist)

2.2.4.1 Normal distribution

qs <- seq(0.05,0.95,length = 100)
samp_qs <- quantile(d1$ht,qs)
theor_qs <- qnorm(qs,mean = coef(fit)[1],sd =coef(fit)[2])
plot(samp_qs,theor_qs)
abline(0,1)

2.2.4.2 Gamma distribution

qs <- seq(0.05,0.95,length = 100)
samp_qs.gamma <- quantile(d1$ht,qs)
theor_qs.gamma <- qgamma(qs,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2])
plot(samp_qs.gamma,theor_qs.gamma)
abline(0,1)

2.2.4.3 Weibull distribution

qs <- seq(0.05,0.95,length = 100)
samp_qs.w <- quantile(d1$ht,qs)
theor_qs.w <- qweibull(qs,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2])
plot(samp_qs.w,theor_qs.w)
abline(0,1)

2.2.5 Estimated Median

2.2.5.1 Normal distribution

qnorm(0.5,mean = coef(fit)[1],sd = coef(fit)[2])
## [1] 160.7419

2.2.5.2 Gamma distribution

qgamma(0.5,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2])
## [1] 160.6312

2.2.5.3 Weibull distribution

qweibull(0.5,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2])
## [1] 161.5156

2.2.6 Median Samp Dist (hist)

2.2.6.1 Normal distribution

M <- 500
N <- 1000
out <- rnorm(N*M,mean = coef(fit)[1],sd =coef(fit)[2])  %>%  array(dim = c(M,N))

samp_dist <- apply(out,1,median)
hist(samp_dist,breaks = 100,,main = "Median Samp Dist (hist) of normal distribution")

2.2.6.2 Gamma distribution

M <- 500
N <- 1000
out <- rgamma(N*M,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2])  %>%  array(dim = c(M,N))

samp_dist.g <- apply(out,1,median)
hist(samp_dist.g,breaks = 100,main = "Median Samp Dist (hist) of gamma distribution")

2.2.6.3 Weibull distribution

M <- 500
N <- 1000
out <- rweibull(N*M,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2])  %>%  array(dim = c(M,N))

samp_dist.w <- apply(out,1,median)
hist(samp_dist.w,breaks = 100,main = "Median Samp Dist (hist) of weibull distribution")

### Range of middle 95% of Samp Dist

diff(quantile(samp_dist,c(0.05/2,1-0.05/2)))
##    97.5% 
## 1.070803
diff(quantile(samp_dist.g,c(0.05/2,1-0.05/2)))
##    97.5% 
## 1.114164
diff(quantile(samp_dist.w,c(0.05/2,1-0.05/2)))
##    97.5% 
## 1.257542
  • Range of middle 95% of Samp Dist of gamma is 1.1141636, norm is 1.0708029, and the weibull is 1.2575425

3 gt

3.1 MM

3.1.1 Estimates of parameters

#gamma
mm.gamma.shape.g = mean(d1$gh)^2/var(d1$gh)
mm.gamma.scale.g=var(d1$gh)/mean(d1$gh)
#normal
mm.norm.mean.g=mean(d1$gh)
mm.norm.sd.g=sd(d1$gh)
#weibull
mean.weib.g = function(lambda, k){
  lambda*gamma(1 + 1/k)
}

lambda.g = function(samp.mean, k){
  samp.mean/gamma(1+1/k)
}

var.weib.g = function(samp.mean, k,samp.var){
  (lambda(samp.mean, k))^2*(gamma(1+2/k)-(gamma(1+1/k))^2)-samp.var
}

mm.opt.g = optimize(f = function(x){abs(var.weib(k = x, samp.mean = mean(d1$gh), samp.var = var(d1$gh)))},lower = 10, upper = 100)
mm.weib.k.g = mm.opt.g$minimum
mm.weib.lambda.g= lambda(samp.mean = mean(d1$gh), k=mm.weib.k)

3.1.2 Overlay estimated pdf onto histogram

hist(d1$gh, main = "Height of Adult Females: MM", breaks = 100, freq = FALSE,xlim = c(3,12))
curve(dgamma(x, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g), add = TRUE, col = "red")
curve(dnorm(x, mm.norm.mean.g, mm.norm.sd.g), add = TRUE, col = "blue", lwd = 2)
curve(dweibull(x, shape = mm.weib.k.g, scale = mm.weib.lambda.g),add = TRUE, col = "green",lwd = 2)

  • the red line is gamma, the blue line is norm and the green line is weibull

3.1.3 Overlay estimated CDF onto eCDF

plot(ecdf(d1$gh),main = "The CDF and ECDF of ht: MM")
curve(pgamma(x, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g), add = TRUE, col = "red",lwd = 6)
curve(pnorm(x, mm.norm.mean.g, mm.norm.sd.g), add = TRUE, col = "blue", lwd = 4)
curve(pweibull(x, shape = mm.weib.k.g, scale = mm.weib.lambda.g),add = TRUE, col = "green",lwd = 4)

  • the red line is gamma, the blue line is norm and the green line is weibull

3.1.4 QQ plot (sample vs estimated dist)

p = ppoints(3000)
y.g = quantile(d1$gh, probs = p)
#gamma
x.gamma.mm.g = qgamma(p, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g)
plot(x.gamma.mm.g,y.g,main = "Gamma QQplot")
abline(0,1)

#norm
x.norm.mm.g = qnorm(p, mm.norm.mean.g, mm.norm.sd.g)
plot(x.norm.mm.g,y.g,main = "Norm QQplot")
abline(0,1)

#weibull
x.pwei.mm.g=qweibull(p, shape = mm.weib.k.g, scale = mm.weib.lambda.g)
plot(x.pwei.mm.g,y.g,main = "Weibull QQplot")
abline(0,1)

3.1.5 Estimated Median

median(d1$gh)
## [1] 5.5
qweibull(0.5, shape = mm.weib.k.g, scale = mm.weib.lambda.g)
## [1] 5.629781
qgamma(0.5, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g)
## [1] 5.660259
qnorm(0.5, mm.norm.mean.g, mm.norm.sd.g)
## [1] 5.7246
  • we could know that the median of ht is 160.8 , and the norm median is 5.7246, the gamma median is 5.6602591, and the qweibull median is 5.6297806

3.1.6 Median Samp Dist (hist)

#gamma
med.gam.g = NA
for (i in 1:1000) {
  data = rgamma(200, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g)
  med.gam.g[i] = median(data)
}
hist(med.gam.g,breaks = 100,main = "Histogram of median of gamma distribution")

#norm
med.norm.g = NA
for (i in 1:1000) {
  data = rnorm(200, mm.norm.mean.g, mm.norm.sd.g)
  med.norm.g[i] = median(data)
}
hist(med.norm.g,breaks = 100,main = "Histogram of median of normal distribution")

#
med.wei.g = NA
for (i in 1:1000) {
  data = rweibull(200, shape = mm.weib.k.g, scale = mm.weib.lambda.g)
  med.wei.g[i] = median(data)
}
hist(med.wei.g,breaks = 100,main = "Histogram of median of weibull distribution")

3.1.7 Range of middle 95% of Samp Dist

diff(quantile(med.gam.g,probs = c(0.025,0.975)))
##     97.5% 
## 0.3623804
diff(quantile(med.norm.g,probs = c(0.025,0.975)))
##     97.5% 
## 0.3743458
diff(quantile(med.wei.g,probs = c(0.025,0.975)))
##     97.5% 
## 0.2178256
  • Range of middle 95% of Samp Dist of gamma is 0.3623804, norm is 0.3743458, and the weibull is 0.2178256

3.2 MLE

3.2.1 Estimates of parameters

3.2.1.1 Normal distribution

library(stats4)
nLL.g <- function(mean, sd){
  fs <- dnorm(
    x = d1$gh
    ,mean = mean
    ,sd = sd
    ,log = TRUE
  )
  -sum(fs)
}

fit.g <- mle(
  nLL.g
  ,start = list(mean = 1, sd = 1)
  ,method = "L-BFGS-B"
  ,lower = c(0,0.01)
)

3.2.1.2 Gamma distribution

nLL.gamma.g <- function(shape, scale){
  fs <- dgamma(
    x = d1$gh
    ,shape = shape
    ,scale = scale
    ,log = TRUE
  )
  -sum(fs)
}

fit.gamma.g <- mle(
  nLL.gamma.g
  ,start = list(shape = 1, scale = 1)
  ,method = "L-BFGS-B"
  ,lower = c(0,0.01)
)

3.2.1.3 Weibull distribution

nLL.weibull.g <- function(shape, scale){
  fs <- dweibull(
    x = d1$gh
    ,shape = shape
    ,scale = scale
    ,log = TRUE
  )
  -sum(fs)
}

fit.weibull.g <- mle(
  nLL.weibull.g
  ,start = list(shape = 1, scale = 1)
  ,method = "L-BFGS-B"
  ,lower = c(0,0.01)
)

3.2.2 Overlay estimated pdf onto histogram

3.2.2.1 Normal distribution

hist(d1$gh,breaks = 100, freq = FALSE,xlim = c(4,12),main = "Overlay estimated pdf onto histogram of Normal distribution")
curve(dnorm(x,mean = coef(fit.g)[1],sd =coef(fit.g)[2]),col = "red" ,add = TRUE, lwd = 4)

3.2.2.2 Gammal distribution

hist(d1$gh,breaks = 100, freq = FALSE,,main = "Overlay estimated pdf onto histogram of Gamma distribution")
curve(dgamma(x,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2]),col = "blue" ,add = TRUE, lwd = 4)

3.2.2.3 Weibull distribution

hist(d1$gh,breaks = 100, freq = FALSE,main = "Overlay estimated pdf onto histogram of Weibull distribution")
curve(dweibull(x,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2]),col = "green" ,add = TRUE, lwd = 4)

3.2.3 Overlay estimated CDF onto eCDF

3.2.3.1 Normal distribution

plot(ecdf(d1$gh),,main = "Overlay estimated CDF onto eCDF of normal distribution")
curve(pnorm(x,mean = coef(fit.g)[1],sd =coef(fit.g)[2]),col = "red" ,add = TRUE, lwd = 4)

3.2.3.2 Gamma distribution

plot(ecdf(d1$gh),,main = "Overlay estimated CDF onto eCDF of gamma distribution")
curve(pgamma(x,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2]),col = "blue" ,add = TRUE, lwd = 4)

3.2.3.3 Weibull distribution

plot(ecdf(d1$gh),,main = "Overlay estimated CDF onto eCDF of weibull distribution")
curve(pweibull(x,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2]),col = "green" ,add = TRUE, lwd = 4)

3.2.4 QQ plot (sample vs estimated dist)

3.2.4.1 Normal distribution

qs <- seq(0.05,0.95,length = 5000)
samp_qs.g <- quantile(d1$gh,qs)
theor_qs.g <- qnorm(qs,mean = coef(fit.g)[1],sd =coef(fit.g)[2])
plot(samp_qs.g,theor_qs.g)
abline(0,1)

3.2.4.2 Gamma distribution

qs <- seq(0.05,0.95,length = 5000)
samp_qs.gamma.g <- quantile(d1$gh,qs)
theor_qs.gamma.g <- qgamma(qs,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2])
plot(samp_qs.gamma.g,theor_qs.gamma.g)
abline(0,1)

3.2.4.3 Weibull distribution

qs <- seq(0.05,0.95,length = 100)
samp_qs.w.g <- quantile(d1$ht,qs)
theor_qs.w.g <- qweibull(qs,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2])
plot(samp_qs.w.g,theor_qs.w.g)
abline(0,1)

3.2.5 Estimated Median

3.2.5.1 Normal distribution

qnorm(0.5,mean = coef(fit.g)[1],sd = coef(fit.g)[2])
## [1] 5.7246

3.2.5.2 Gamma distribution

qgamma(0.5,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2])
## [1] 5.677983

3.2.5.3 Weibull distribution

qweibull(0.5,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2])
## [1] 5.64902

3.2.6 Median Samp Dist (hist)

3.2.6.1 Normal distribution

M <- 500
N <- 1000
out.g <- rnorm(N*M,mean = coef(fit.g)[1],sd =coef(fit.g)[2])  %>%  array(dim = c(M,N))

samp_dist.g.g <- apply(out.g,1,median)
hist(samp_dist.g.g,breaks = 100,,main = "Median Samp Dist (hist) of normal distribution")

3.2.6.2 Gamma distribution

M <- 500
N <- 1000
out.gamma.g <- rgamma(N*M,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2])  %>%  array(dim = c(M,N))

samp_dist.gamma.g <- apply(out.gamma.g,1,median)
hist(samp_dist.gamma.g,breaks = 100,main = "Median Samp Dist (hist) of gamma distribution")

3.2.6.3 Weibull distribution

M <- 500
N <- 1000
out.w.g <- rweibull(N*M,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2])  %>%  array(dim = c(M,N))

samp_dist.w.g <- apply(out.w.g,1,median)
hist(samp_dist.w.g,breaks = 100,main = "Median Samp Dist (hist) of weibull distribution")

3.2.7 Range of middle 95% of Samp Dist

diff(quantile(samp_dist.g.g,c(0.05/2,1-0.05/2)))
##    97.5% 
## 0.150214
diff(quantile(samp_dist.gamma.g,c(0.05/2,1-0.05/2)))
##     97.5% 
## 0.1338429
diff(quantile(samp_dist.w.g,c(0.05/2,1-0.05/2)))
##     97.5% 
## 0.2222051
  • Range of middle 95% of Samp Dist of gamma is 0.1338429, norm is 0.150214, and the weibull is 0.2222051

4 Conclusion

4.1 ht

According to Plots, normal and gamma distribution could fit the data in both MM and MLE, the weibull distribution might could not fit the data very well in MM and MLE. But the weibull distribution might fit the data better in MLE than MM.

4.2 gh

According to Plots, none of the distribution could fit the data very well.